# Code to produce graphs of scope for fishing given different degrees of freedom
# Macartan Humphreys, Raul Sanchez de la Sierra, and Peter van der Windt 
#
# Contents
# A. Data generating functions
# B. Fishing  functions
# C. Other useful  functions
# D. Implementation of Simulations
# E. Graphing results
#


# A. Data generating functions
#################################################################################

# Function to produce orthogonal x and y 
rand.xy = function(n){data.frame(y=rnorm(n), x = rnorm(n))} 
	# illustrate
	cor(rand.xy(5))

orthog = function(n){
	y = rnorm(n)
	data.frame(y=y, x = lm(rnorm(n)~y)[[2]]) 
	} 
	# illustrate
	cor(orthog(4))	

orthogbin = function(n){
	y = (rank(runif(n))/n)>runif(1)
	x = lm(rnorm(n)~y)[[2]]  
	data.frame(y=y, x = x) 
	} 

rand.bin = function(n){
	y = (rank(runif(n))/n)>runif(1)
    if(mean(y)==0) y[1]=1
    if(mean(y)==1) y[1]=0
	data.frame(y=y, x = rnorm(n)) 
	} 

	
# B. Fishing  functions
#################################################################################
	
# 1. Probability of getting positive result given one covariate from set
one.cov = function(n, k, sims, cutoff=.05, equalsize=NULL, xy = orthog){
	p = matrix(NA, k, sims)
	for(j in 1:sims){
	data = xy(n)
	for(i in 1:k){
	m = lm(data$y ~ data$x + runif(nrow(data)))
	p[i, j]<-summary(m)[[4]][2,4]
	}}
	mean(apply(p<cutoff, 2, max))	
	}
# illustrate
# one.cov(10, 100, 20)	

# 2. Probability of getting positive result given up to v covariates from a set of k random covariates 
# note we only examine up to maximium v = n-3 covariates
many.fish = function(n, k, sims, cutoff=.05, equalsize=NULL, xy = orthog,v = n-3){	# here k is teh number of covariates available
	powerset = (t(perm(k))==1)[-1,]  # power set as TRUE/FALSE statements, excluding NO CONTROLS (first row) 
	powerset = powerset[rowSums(powerset)<=v,]  # Focus only on combinations that use v or fewer covariates
	sig = matrix(0, sims) # significance matrix
		for(i in 1:sims){
			data = xy(n)
			CONTROLS = matrix(rnorm(n*k), n, k)
			if(v==0){CONTROLS = matrix(1,n); powerset = matrix(1, 1, 1)}  # DEAL WITH SPECIAL CASE OF NO CONTROLS
			j = 1 
			while(j <= nrow(powerset)){
				m = lm(data$y ~ data$x + CONTROLS[, powerset[j,]]) 
				if(summary(m)[[4]][2,4]<cutoff) {
					sig[i] = 1
					j <- nrow(powerset) + 1}
				else {j <- j+1}
					}
	}
	mean(sig)
	}
# x=many.fish(10, 8, 200, cutoff=.05, equalsize=NULL)
# x

# 3. Probability of getting positive result given fishing for heterogeneous effects
# k ways of selecting subgroups for effects; each subgroup has at least 2 units
take.het = function(n, k, sims, cutoff=.05, equalsize = FALSE, xy = orthog){
	if(k>2^n) print("Note that k>2^n")	
	p = matrix(NA, 2*k, sims)
	for(j in 1:sims){
	data = xy(n)
	for(i in 1:k){
	if(equalsize==FALSE) {s= sample(2:(n-2),1)} else {s = round(n/2)}
	X = 1:n %in% sample(1:n, s)  # This creates 2 subgroups of random size of at least 2
	m1 = lm(data$y[X] ~ data$x[X])
	p[i, j]<-summary(m1)[[4]][2,4]
	m2 = lm(data$y[X==FALSE] ~ data$x[X==FALSE])
	p[k+i, j]<-summary(m2)[[4]][2,4]
	}}
	mean(apply(p<cutoff, 2, max, na.rm=TRUE))	
	}

# 4. Probability of getting positive result given fishing for cutoffs on Y
take.cut = function(n, k, sims, cutoff=.05, equalsize = FALSE, xy = orthog){
	p = matrix(NA, n, sims)
	for(j in 1:sims){
		data = xy(n)
		for(i in 2:(n-1)){
			y =  data$y>=(sort(data$y)[i])
			m = lm(y ~ data$x)
			p[i, j]<-summary(m)[[4]][2,4]
			}}
	p = p[c(-1, -nrow(p)),]	
	mean(apply(p<cutoff, 2, max), rm.na=T)	
	}
# take.cut(20, a, 100) 	
	
# 5. Probability of getting positive result given fishing for models (binary variable -- linear, logit, probit)
take.model = function(n, k, sims, cutoff=.05, equalsize = NULL, xy = orthogbin){
	p = matrix(NA, 3, sims)
	for(j in 1:sims){
	data = xy(n)
	y = data$y
	x = data$x
	m1 = lm(y ~ x)
	m2 = glm(formula = y ~ x, family = binomial(link = "logit"), na.action = na.pass)
	m3 = glm(formula = y ~ x, family = binomial(link = "probit"), na.action = na.pass)
	p[1, j]<-summary(m1)[[4]][2,4]
	p[2, j]<-summary(m2)[[12]][2,4]	
	p[3, j]<-summary(m3)[[12]][2,4]
	}
	mean(apply(p<cutoff, 2, max, na.rm=TRUE))	
	}
	
# take.model(20, 2, 100) 	


# C. Other useful  functions
#################################################################################

# permutations -- used for fishing for many covariates (Credit: Bernd Berber)
perm <- function(k) {
           t(sapply(1:k, function(x) {
           rep( c(rep(0, 2^k / 2^x), rep(1, 2^k / 2^x)), length.out=2^k)
           } ) ) }
# perm(3) # power set


# Code to generate a set of results for multiple values of k and n	
results = function(krange, nrange, sims, cutoff=.05, f = one.cov, equalsize=FALSE, xy = orthog){
	g = matrix(NA, length(krange), length(nrange))
	for(k in 1:length(krange)){for(n in 1:length(nrange)){
		print(paste("n=",nrange[n], ", k=",krange[k], sep=""))
		g[k,n] = f(nrange[n], krange[k], sims, cutoff=cutoff, equalsize=equalsize, xy = xy)	}}
	colnames(g)<-nrange
	rownames(g)<-krange
	g}	

# Function to plot results
plotmatrix = function(M, grlab="", sep=.05, xlab = "", ylab="", main="",  labs=TRUE, ylim = c(0,1)){
	X=as.numeric(rownames(M))
	md = floor(length(X)/2)
	if((length(X)%%2)) md<- .5+length(X)/2
	plot(X, M[,1], type="l", xlab=xlab, ylab=ylab, main=main, ylim = ylim)
	if(labs){text(X[md], M[,1][md]+sep,  paste(grlab, colnames(M)[1]))}
	for(j in 2:ncol(M)){lines(rownames(M), M[,j])
	if(labs)text(X[md], M[,j][md]+sep, paste(grlab, colnames(M)[j]))
	}}

	
# D. Implementation of Simulations
########################################################################################

# Note all n ranges and k ranges easily changed below

sims = 10 										# Increase to 2000+ for more reliable estimates	
nrange = c(10 , 15, 25, 50,  100, 150, 200)	
n = max(nrange)

# results take k then n
# 1 FISHING RESULTS
# POND SIZE
# SOME.CONF.RESULTS = results(c(10, 100, 1000), c(10,20, 40), sims, f = one.cov)	
# SOME.CONF.RESULTS
SOME.CONF.RESULTS2 = results(c(10, 100, 1000), nrange, sims, f = one.cov, xy=rand.xy)	
SOME.CONF.RESULTS2

# 2 MANY FISH -- THIS IS A SLOW ONE -- EXTREMELY SLOW WITH HIGHER K
# SOME.FISH.RESULTS = results(c(5, 10, 15), c(10,20, 40), sims, f = many.fish)	
# SOME.FISH.RESULTS
SOME.FISH.RESULTS2 = results(c(10, 12, 14), nrange, sims, f = many.fish, xy=rand.xy)	
SOME.FISH.RESULTS2

# 3 BIODIVERSITY: HETEROGENEOUS TREATMENT EFFECTS
SOME.HET.RESULTS = results(c(1, 10, 100), nrange, sims, f = take.het, xy=rand.xy)	
SOME.HET.RESULTS
SOME.HET.RESULTS2 = results(c(1, 10, 100), nrange, sims, f = take.het, equalsize=TRUE, xy=rand.xy)	
SOME.HET.RESULTS2

# 4 DEFINITION OF YS
# SOME.CUTS.RESULTS = results(c(1), c(10,50, 100, 200), sims, f = take.cut)	
# SOME.CUTS.RESULTS
SOME.CUTS.RESULTS2 = results(c(1), nrange, sims, f = take.cut, xy=rand.xy)	
SOME.CUTS.RESULTS2

# 5 Linear, logit, or probit?
# MODEL.RESULTS = take.model(10, NA, sims, xy = orthogbin) 
# MODEL.RESULTS 
# MODEL.RESULTS2 = take.model(10, NA, sims, xy=rand.bin) 
# MODEL.RESULTS2 
SOME.MOD.RESULTS = results(c(1), nrange, sims, f = take.model, xy = rand.bin)	
SOME.MOD.RESULTS


## E. Graph Results
###################################################################################################

png(file="Fishing.png", width=10, height=12, res=400, units="in") 
	par(mfrow=c(4,2))
	#k = 1:50
	#plot(k, 1-.95^k, main = "Probability of at least one significant result \n given k random measures to choose from", ylab = expression(paste("Scope for fishing = ",1-.95^m)), type="l")
	plot(nrange, rep(1-.95, length(nrange)), main = "Probability of at least one significant result \n given k random measures to choose from", ylab = expression(paste("Scope for fishing = ",1-.95^k)), type="l", ylim = c(0,1))
		text(n/2, 1-.95+.05, "k=1")
		for(i in seq(5,30,5)){lines(nrange, rep(1-.95^i, length(nrange))); text(n/2, 1-.95^i+.05, paste("k=", i))}
	plot(c(0),c(0), ann=F, axes=F, type = "n")
	plotmatrix(t(SOME.CONF.RESULTS2), grlab="k=", xlab = "n", ylab = "Scope for fishing", main="Fishing by Adding a Covariate \n Given k to choose from", ylim = c(0,1))	
	plotmatrix(t(SOME.FISH.RESULTS2), grlab="k=", xlab = "n", ylab = "Scope for fishing", main="Fishing by Adding Up to n-3  Covariates \n Given k to choose from", ylim = c(0,1))	
	plotmatrix(t(SOME.HET.RESULTS), grlab="k=", xlab = "n", ylab = "Scope for fishing", main="Fishing with Heterogeneous Treatment Effects \n Given k arbitrary ways to split the cases", ylim = c(0,1))	
	plotmatrix(t(SOME.HET.RESULTS2), grlab="k=", xlab = "n", ylab = "Scope for fishing", main="Fishing with Heterogeneous Treatment Effects \n Given k ways to split the cases in half", ylim = c(0,1))	
	plot(as.numeric(colnames(SOME.CUTS.RESULTS2)), SOME.CUTS.RESULTS2, xlab = "n", ylab = "Scope for fishing", main="Fishing by Dichotomizing the Outcome Variable \n Given (n-1) possible cut points", type="l", ylim = c(0,1))	
	plot(as.numeric(colnames(SOME.MOD.RESULTS)), SOME.MOD.RESULTS, xlab = "n", ylab = "Scope for fishing", main="Fishing by Selection of Model \n (LPM, Logit,  Probit)", type="l", ylim = c(0,1))	
dev.off()


